---
title: "Replication Material for: How technological change affects regional voting patterns. Part 1: Prepare data set"
output: html_document
author: Nikolas Schöll and Thomas Kurer
editor_options: 
  chunk_output_type: inline
---
# Load packages 
```{r  1_1}
pacman::p_load(tidyverse,
               magrittr,
               readstata13, # read data from stata
               xlsx, # read excel files
               stringi # to replace umlaute
               )

parties = c("greens", "linke",  "spd", "fdp",  "cdu_csu", "authoritarian_right")
```

# #### Prepare Data sources ### 
# Elections Data 
## National elections from CSVs from BUNDESWAHLLEITER

Note:
- NPD not running in 1994
- Left Party not running in Western Germany in 1994
- AFD from 2015

source: https://www.bundeswahlleiter.de/dam/jcr/2e018ffc-0368-4c87-b85f-23dae3a5c8f5/btw2017kreis.csv
```{r 1_2}
election_list = list()
folder = "../data/election_results/national_elections/"
files = dir(folder)
for(file in dir(folder)){
  
  print(file)
  year = file %>% str_extract("(\\d)+")
  results_year = read.csv(paste0(folder,file), sep = ";")

  # Fix names
  names(results_year) = 
    map2_chr(.x = results_year[4,], .y = results_year[5,], .f = ~paste(.x, .y)) %>% 
    trimws() %>% 
    str_replace_all(pattern = " ", "_")
  
  # Fix Wahlberechtigte Typo in 2013
  if(year == "2013"){
    results_year = results_year %>% 
      rename(Wahlberechtigte = `Wahlbe-rechtigte`)
  }
  
  # Remove empty cells
  results_year = results_year[-(1:5),]
  
  results_year = results_year %>% 
    mutate(year = year %>% as.numeric(),
           election_type = "national") %>% 
  mutate_at(.vars = names(results_year[-(1:3)]), .funs = ~ as.character(.) %>%  as.numeric())
  
  
  election_list[[year]] = results_year
}
elections_national = election_list %>% map_df(~bind_rows(.)) 

#fix some var names
names(elections_national)[3] = "region_name"
names(elections_national) = names(elections_national) %>% str_remove_all("_Zweitstimmen")


elections_national = elections_national  %>%
  rename(region_code_historic = Statistische_Kennziffer) %>% 
  filter(!is.na(region_code_historic), #filter out non relevant rows
         region_code_historic != "",
         region_name %>% str_detect("Briefwahl", negate = T)) %>% 
  rename(AFD = AfD) %>% 
  mutate_at("region_code_historic", as.numeric) %>% 
  mutate(CDU_CSU = CDU + CSU,
         authoritarian_right = replace_na(AFD, 0) + replace_na(DVU, 0) + replace_na(NPD, 0) + replace_na(REP, 0),
         LINKE = replace_na(DIE_LINKE, 0) + replace_na(PDS, 0) ) %>% 
  select(region_code_historic,
         year, 
         election_type,
         voters = `Wähler`,
         greens = `GRÜNE`,
         linke = LINKE,
         spd = SPD,
         fdp = FDP,
         cdu_csu = CDU_CSU, 
         authoritarian_right
         )

rm(file, files, folder, year, results_year, election_list)
```


## EU elections from CSVs from BUNDESWAHLLEITER

Note about authoritarian-right parties:
- AFD: 2014, 2019
- NPD: all but 2009 (delegated to DVU)
- DVU: only 2009 
- Die Rechte: 2019 (recoded as DVU)


```{r  1_3}
election_list = list()
folder = "../data/election_results/eu_elections/"
for(file in dir(folder)){
 
  
  print(file)
  year = file %>% str_extract("(\\d)+")
  
  if(year <= 2004){
      results_year = read.csv(paste0(folder,file), sep = ";", skip = 5)
      names(results_year)[1] = "region_code_historic"
      
      #clean region codes
      results_year[["region_code_historic"]] = 
        results_year[["region_code_historic"]] %>% 
        str_remove_all(" ") %>% 
        as.numeric()
  
      if (year <= 1999){
        #rename FDP
        results_year = results_year %>% 
          rename(FDP = F.D.P.)
      }
      
  } else if (year <= 2014) {
    
    results_year = read.csv(paste0(folder,file), sep = ";", skip = 5)
    
    # remove first row of data and unnecessary vars
    to_exclude = (results_year[1,] %in% c("Vorperiode", "x")) 
    results_year = results_year[-1,!to_exclude]
    
    # convert results to integers, fix region_code_historic
    results_year = results_year %>% 
      mutate_at(.vars = names(results_year[4:length(results_year)]), .funs = ~ as.character(.) %>%  as.numeric()) %>% 
      rename(region_code_historic = Nr)
    
      if (year == 2014){
        #rename AfD
        results_year = results_year %>% 
          rename(AFD = AfD)
      }
    
  } else{ # year == 2019
    
      results_year = read.csv(paste0(folder,file), sep = ";", encoding = "UTF-8", skip = 2)

      # remove first two rows of data and unnecessary vars
      to_exclude = (results_year[2,]  %in% c("Vorperiode", "x.46")) %>% replace_na(F)
      results_year = results_year[-(1:2),!to_exclude]
      
      # convert to integers, fix region_code, rename DVU
      results_year = results_year %>% 
      mutate_at(.vars = names(results_year[4:length(results_year)]), .funs = ~ as.character(.) %>%  as.numeric()) %>% 
      rename(region_code_historic = Nr,
             DVU = `DIE.RECHTE...Partei.für.Volksabstimmung..Souveränität.und.Heimatschutz`)
      
      # fix party names
      names(results_year)[4:18] = c("Wahlberechtigte", "Wähler", "Ungültige", "Gültige" , "CDU" , "SPD", "GRÜNE", "DIE.LINKE", "AFD", "CSU", "FDP","FREIE.WÄHLER", "Piratenpartei.Deutschland", "PARTEI.MENSCH.UMWELT.TIERSCHUTZ", "NPD")
  }
  
  #fix region name
  names(results_year)[2] = "region_name"
  
  # add year and election type 
  results_year = results_year %>%
    mutate(year = year %>% as.numeric(),
           election_type = "eu")
  
  # store to list
  election_list[[year]] = results_year
}


#merge together
elections_eu = election_list %>% map_df(~bind_rows(.)) %>%
  filter(nchar(region_code_historic) > 3) %>% #filter out lower or higher levels
  mutate(region_code_historic = ifelse(nchar(region_code_historic) == 5,
                              region_code_historic,
                              paste0("0",region_code_historic)), #fix missing 0 at start of some region codes
         CDU_CSU = replace_na(CDU, 0) + replace_na(CSU, 0),
         authoritarian_right = replace_na(AFD, 0) + replace_na(DVU, 0) + replace_na(NPD, 0) + replace_na(REP, 0),
         LINKE = replace_na(DIE.LINKE, 0) + replace_na(PDS, 0) 
         ) %>%
  mutate_at("region_code_historic", as.numeric) %>% 
  select(region_code_historic,
         year, 
         election_type,
         voters = `Wähler`,
         greens = `GRÜNE`,
         linke = LINKE,
         spd = SPD,
         fdp = FDP,
         cdu_csu = CDU_CSU, 
         authoritarian_right
         )

rm(election_list, results_year, to_exclude, file, folder, year)
```



## State elctions

```{r 1_4}

substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}

folder = "../data/election_results/state_elections/"
files = dir(folder) %>% .[str_detect(., pattern = "xlsx")]
state_election_list = list()
for (state_i in 1:length(files)){
  file = files[state_i]
  state_election_list[[state_i]] = xlsx::read.xlsx(paste0(folder, file), sheetName = "cleaned", encoding = "UTF-8", colClasses = ) %>%
    rename(region_code_historic = Statistische_Kennziffer) %>% 
    mutate(authoritarian_right = AFD, 
           year = year %>% as.character() %>%  substrRight(., 4) %>% as.integer()) %>% 
    mutate_all(as.character) %>% 
    mutate_all(as.numeric) %>%
    mutate(election_type = "state") 
}

elections_state = 
  state_election_list %>% 
  bind_rows() %>% 
  filter(region_code_historic %in% 1000:20000) %>%
  select(region_code_historic,
         year, 
         election_type,
         eligible_to_vote = Wahlberechtigte,
         voters = `Wähler`,
         greens = `GRÜNE`,
         linke = LINKE,
         spd = SPD,
         fdp = FDP,
         cdu_csu = CDU_CSU, 
         authoritarian_right
         )

rm(file, files, folder, state_i, state_election_list, substrRight)
```

## Merge election data frames
```{r 1_5}
# merge and select relevant vars
elections_combined = 
  list(elections_eu, 
       elections_national,
       elections_state) %>% 
  bind_rows()


# Use only one election per Year: federal > state > eu
elections_combined %<>% 
  mutate(election_type = factor(election_type, 
                                levels = c("national", "state", "eu"))) %>% 
  arrange(region_code_historic, year, election_type) %>% 
  group_by(region_code_historic, year) %>% 
  filter(row_number() == 1)

# elections_combined %>% write_rds(file = "../data/election_results/elections_1994-2019.RDS")

#rm(elections_eu, elections_national, elections_state)
```


# Destatis
Information from German statistical office (destatis)
## Population

From Gemeindeverzeichnis
Source: https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/Archiv/GVAuszugJ/3112<<YEAR>>_Auszug_GV.xlsx?__blob=publicationFile
Insert year in link where it says <<YEAR>>
```{r 1_6, warning=FALSE}
# load yearly information of population
population = list()
for (year_i in 1994:2017){
 print(year_i)
 file = paste0("../data/destatis/Population Gemeindeverzeichnis ", year_i, ".xlsx")

 population[[paste(year_i)]] = readxl::read_excel(file, sheet = "Gemeindedaten", skip = 3) %>%
   transmute(region_code_historic = paste0(Land, RB, Kreis)%>% as.integer(),
             population = insgesamt %>% as.numeric()) %>%
   filter(!is.na(population)) %>%
   group_by(region_code_historic) %>%
   summarise(population = population  %>% sum()) %>%
   mutate(year = year_i)
}

# merge to one data frame
population %<>%
  bind_rows() 

# save
population %>%
  write_rds("../data/destatis/population.RDS")

rm(year_i, file)
```


## GDP per capita
Data from Statistische Ämter des Bundes und der Länder (link)[https://www.statistik-bw.de/VGRdL/tbls/R0B0.jsp?rev=RV2014&tbl=R2B1].
This information is provided in modern counties.
```{r 1_7}
gdp_per_capita = 
    readxl::read_excel("../data/destatis/VGRDL.xlsx", sheet = "6.", skip = 4) %>% 
    filter(`NUTS 3` == 3) %>% 
    mutate(region_code = `Regional-schlüssel` %>% as.integer(),
           region_code = ifelse(region_code < 100, region_code*1000, region_code)) %>%
    select(region_code, `1994`:`2017`) %>% 
    gather(key = "year", value = "gdp_per_capita", `1994`:`2017`) %>% 
    mutate_at(c("year", "gdp_per_capita"), as.numeric) %>% 
  # GDP per capita in thousands to get interpretable coefficients
  mutate(gdp_per_capita = gdp_per_capita / 1000)

gdp_per_capita %>% 
  write_rds("../data/destatis/gdp_per_capita.RDS")
```

# IAB

## SIAB
Import data set generated from social security data (SIAB) and create relevant variables.

```{r 1_8}
# Load 
SIAB = readstata13::read.dta13("../data/IAB/aggregated_sample.dta", 
                              nonint.factors = TRUE, 
                              convert.factors = TRUE) %>% 
  rename(state = state_code,
         individuals.siab = SIAB_n_individuals,
         foreigners.siab = SIAB_foreign) %>% 
  mutate(region_code = readstata13::read.dta13("../data/IAB/aggregated_sample.dta", 
                              convert.factors = F)$region,
         region = region %>% 
           as.character() %>% 
           str_remove_all('[[:digit:]]+') %>% 
           trimws(),
         east_west = if_else(broad_employment_region == "east", "east", "west")) %>% 
  select(-starts_with("SIAB_n_")) #remove counts 


# remove preceding part
names(SIAB) = names(SIAB) %>% map_chr(~str_remove(., "SIAB_"))

# rename and create new variables
SIAB %<>% 
  rename(emp.siab = emp,
         emp.siab_manufacturing = emp_manufacturing) %>% 
  mutate(emp.siab_non_manufacturing = emp.siab - emp.siab_manufacturing) %>% 
  # Rescale employment variables (SIAB offers 2% sample of overall population) and calculate shares in percentage points:
  mutate_at(vars(starts_with("emp"), foreigners.siab, individuals.siab), 
            ~ . * 50) %>%
  #get employment shares in percentage points:
  mutate_at(vars(starts_with("emp_")), list(share = ~ . / emp.siab * 100)) %>% 
    #calculate skill and education shares relative to all workers with information on that variable (and make 0 to missing as information for that group was missing for the given year/county, e.g. for confidentiality reasons):
  mutate_at(c("emp_lq", "emp_mq", "emp_hq"), 
            list(share = ~ . / (emp_lq + emp_mq + emp_hq) * 100),
            c("emp_lq_share", "emp_mq_share", "emp_hq_share"), 
            ifelse(. > 0, ., NA)) %>% 
  mutate_at(c("emp_edu_high", "emp_edu_low", "emp_edu_no"), 
            list(share = ~ . / (emp_edu_high + emp_edu_low + emp_edu_no) * 100),
            c("emp_edu_high_share", "emp_edu_low_share", "emp_edu_no_share"),
            ifelse(. > 0, ., NA)) %>% 
  # calculate task shares in percentage points
  mutate(task_cognitive_non_routine = (gwkomp1 + gwkomp2) * 100,
       task_cognitive_routine = gwkomp3 * 100,
       task_manual_routine = gwkomp4 * 100,
       task_manual_non_routine = gwkomp5 * 100) 


# Rescale robots, ICT and trade by employment
SIAB %<>% 
  group_by(region) %>% 
  arrange(region, year) %>% 
  mutate(robots = robot_count / first(emp.siab) * 1000,
         log_robots = log(1 + robots),
         robots_eu = robot_eu_count / first(emp.siab) * 1000,
         log_robots_eu = log(1 + robots_eu),
         ICT = ICT_total_k / first(emp.siab) * 1000,
         ICT = ifelse(year == 1994, dplyr::lead(ICT), ICT), # take 1995 value for 1994 ICT (it's missing)
         ICT_eu = ICT_eu_total_k / first(emp.siab) * 1000,
         ICT_eu = ifelse(year == 1994, dplyr::lead(ICT_eu), ICT_eu), # take 1995 value for 1994 ICT (it's missing)
         net_exports = net_exports / first(emp.siab) / 1000, #net exports in 1000 euros per worker
         ) %>% 
  ungroup()

# Keep only relevant vars and years
SIAB %<>%
  select(-broad_region,
         -contains("gwkomp"),
         -starts_with("wage"),
         -starts_with("mean_age"),
         -starts_with("main_task"),
         -starts_with("kids"),
         -starts_with("n_"),
         -ICT_k, -ICT_us_k, -ICT_eu_k, 
         -ICT_total_k, -ICT_us_total_k, -ICT_eu_total_k,
         -China_Export_k, -Eastern_Europe_Export_k,
         -China_Import_k, -Eastern_Europe_Import_k) %>% 
  filter(year %in% 1994:2017)

SIAB %>% write_rds("../data/IAB/SIAB_cleaned.Rds")
```

## Commuting zones
Source:  Kropp and Schwengler (2011) (link)(https://www.iab.de/389/section.aspx/Publikation/k110222301)

```{r 1_9}
# commuting zones
cz = readxl::read_excel("../data/IAB/commuting zones from IAB.xls", 
                        sheet = "Daten2016",
                        skip = 1) %>% 
  select(region_code = Kr,
         commuting_zones_50 = AMR_LAB)

cz %>% 
  write_rds("../data/IAB/commuting_zones.RDS")
```



# Create consistent county defitions
German counties experienced several regional reforms. Election results are according to historic county borders. We use modern county definitions as a consistent definition necessary for panel data. 
This crosswalk uses information on population weights from BBSR.

## Prepare crosswalk counties
```{r 1_10}
crosswalk = list()

for (year_i in 1990:2019){

  if (year_i <= 2015){
      df_raw = readxl::read_excel("../data/crosswalk_historic_counties/Referenzschluessel Kreise von 1990 bis 2016.xlsx", sheet = paste0(year_i,"-2016"), col_types = "text")
  }
  
  if (year_i <= 1996) {
    
    df = df_raw %>%  .[,c(1,2,4,6,7,8)] #only pick relevant vars
    names(df) = c("region_code_historic","region_name_historic", "weight","population", "region_code", "region_name") #rename them
    df %<>% 
  mutate(region_code_historic = region_code_historic %>% str_sub(1,-4) %>% ifelse(nchar(.) == 4, paste0("0",.), .), #--> each code needs 5 digits, string
         region_code = region_code %>% str_sub(1,-4) %>% as.integer(),
         weight = weight %>% as.numeric(),
         population = population %>% as.numeric() %>% round(digits = 3) * 1000,
         year = year_i)
  } else  if (year_i <= 2013) {
    df = df_raw %>%  .[,c(1,2,4,7,8,9,10)] #only pick relevant vars
    names(df) = c("region_code_historic","region_name_historic", "weight","population", "employed", "region_code", "region_name") #rename them
    df %<>% 
  mutate(region_code_historic = region_code_historic %>% str_sub(1,-4) %>% ifelse(nchar(.) == 4, paste0("0",.), .), #--> each code needs 5 digits, string
         region_code = region_code %>% str_sub(1,-4) %>% as.integer(),
         weight = weight %>% as.numeric(),
         population = population %>% as.numeric() %>% round(digits = 3) * 1000,
         employed = employed %>% as.numeric() %>% round(digits = 3) * 1000,
         year = year_i)
  } else  {
    df = df_raw %>%  .[,c(1,2,4,5,6,7)] #only pick relevant vars
    names(df) = c("region_code_historic","region_name_historic", "population", "employed", "region_code", "region_name") #rename them
    df %<>% 
  mutate(region_code_historic = region_code_historic %>% 
           str_sub(1,-4) %>% 
           ifelse(nchar(.) == 4, paste0("0",.), .), #--> each code needs 5 digits, string
         region_code = region_code %>% str_sub(1,-4) %>% as.integer(),
         weight = 1,
         population = population %>% as.numeric() %>% round(digits = 3) * 1000,
         employed = employed %>% as.numeric() %>% round(digits = 3) * 1000,
         year = year_i)
  }
  

  crosswalk[[paste(year_i)]] = df
}

#merge to one data frame
crosswalk %<>% 
  bind_rows() %>% 
  select(region_code_historic, region_code, year, weight, region_name) %>% 
  mutate_at(.vars = c("region_code_historic", "year"),
            .funs = as.integer)

# save
crosswalk %>% 
  write_rds("../data/crosswalk_historic_counties/crosswalk_historic_counties.RDS")


# #save to dta
# weights %>% 
#   haven::write_dta("../data/crosswalk_historic_counties/crosswalk_historic_counties.dta")

rm(year_i, df_raw, df) 

```

## Calculate consistent counties
County reshaping weights from BBSR (<https://www.bbsr.bund.de/>)

```{r 1_11}
# Merge together all data from historic county borders
historic_counties = 
  elections_combined %>% 
  full_join(population, by = c("region_code_historic", "year")) %>% 
  full_join(crosswalk, by = c("region_code_historic", "year")) %>% 
  arrange(region_code, year)

# aggregate to consistent counties
vars_to_reweight = c("greens", "linke", "spd",  "fdp", "cdu_csu", "authoritarian_right", "voters", "population")

consistent_counties = historic_counties %>% 
  group_by(region_code, region_name, election_type, year) %>% 
  mutate_at(vars_to_reweight, .funs = ~.*weight) %>% 
  summarise_at(vars_to_reweight, 
               .funs = sum) %>% 
  filter(year %in% 1994:2017,
         !is.na(region_code)) 


# Calculate political outcomes in percentage points
consistent_counties %<>%
  mutate_at(parties, .funs = ~ ./ voters * 100)  # calculate vote share in percentage points

rm(elections_combined, population, historic_counties, crosswalk, vars_to_reweight)
```


# Create final data sets

## Merge data sources

```{r 1_12}
df_merged = 
  SIAB %>%  
  left_join(gdp_per_capita, by = c("region_code", "year")) %>% 
  left_join(consistent_counties, by = c("region_code", "year")) %>% 
  left_join(cz, by = c("region_code")) %>%
  ungroup()

df_merged %<>% 
  mutate(
    # employment to population ratios:
    employment_population_ratio = emp.siab / population * 100,
    employment_manufacturing_population_ratio = emp.siab_manufacturing / population * 100,
    employment_non_manufacturing_population_ratio = emp.siab_non_manufacturing / population * 100)

# regional identifiers
df_merged %<>%
    # create election-year dummies:
    mutate(election = case_when(election_type == "state" ~ paste(state,year, sep = "_"), 
                                election_type == "national" ~ paste("Bundestag",year, sep = "_"),
                                election_type == "eu" ~ paste("Europa",year, sep = "_"))) %>% 
    ungroup()


df_merged %>% count(election)
```


## Save main data set

```{r 1_13}
df_merged %>% 
  filter(year %in% 1994:2017) %>% #only consider years with complete data (keep all years in df for later usage) %>% 
  select(-starts_with("base"))  %>% #remove long-diff data
  write_rds("../data/sample.RDS")
```


# Extended data set for mechanisms
```{r}
# functions
scale_this =
  function(x){
    (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
    }


# read mechanisms data from Wegweiser Kommune
wwk =
  read.csv("../data/wwk/mechanisms_data_wwk.csv") %>% 
  mutate(log_population_16_64 =  
           ifelse(is.finite(log_population_16_64), 
                  log_population_16_64, 
                  NA))


# merge with main data set
cols = names(wwk)[9:(length(wwk)-1)]

df_mechanisms =
  read_rds("../data/sample.RDS") %>% 
  select(log_robots, ICT, region,  year, region, region_code) %>% 
  merge(wwk, by=c("year", "region_code"), all.x = TRUE) %>% 
  filter(!is.na(population_16_64))

df_mechanisms %<>% 
    mutate_at(cols, scale_this)

df_mechanisms %>% 
  write_rds("../data/sample_mechanisms.RDS")
```


# Create replication dataset DFSW 2021

DFSW use a long difference approach where they compare 1994 to 2014. We replicate their data set as closely as possible.
```{r 1_14}
df_long_diff =
  df_merged %>% 
  group_by(region) %>% 
  arrange(region, year) %>% 
  filter(year %in% c(1994,2014)) %>% 
  mutate_at(c("robots", "robots_eu", "net_exports", "ICT"), ~ dplyr::lead(.) - .) %>%  #changes
  mutate_at(vars(emp_lq_share, 
                 emp_mq_share,
                 emp_hq_share,
                 emp_edu_no_share,
                 emp_edu_low_share,
                 emp_edu_high_share,
                 task_cognitive_non_routine, 
                 task_cognitive_routine, 
                 task_manual_routine,
                 task_manual_non_routine,
                 employment_population_ratio,
                 employment_manufacturing_population_ratio,
                 employment_non_manufacturing_population_ratio),
            .funs = list(change = ~ (dplyr::lead(.) - .))) %>%
  mutate_at(vars(emp.siab,
                 emp.siab_manufacturing,
                 emp.siab_non_manufacturing),
            .funs = list(growth = ~ (dplyr::lead(.) / . - 1) * 100)) %>% 
  mutate_at(vars(starts_with("base_employment_")), ~ .  / base_emp * 100)  %>%  #calculate base year shares in pp
  mutate_at(vars(emp.siab, emp.siab_manufacturing, emp.siab_non_manufacturing),
            .funs = list(population_change_base = ~ (dplyr::lead(.) - .) / population * 100)) %>% 
  ungroup() %>% 
  #impute employment shares of censored (assume that agriculture share = 0)
  mutate(base_employment_public = ifelse(is.na(base_employment_public), 
                                         100 - (base_employment_food_products + base_employment_consumer_goods + base_employment_industrial_goods + base_employment_capital_goods + base_employment_construction +  base_employment_maintenance + base_employment_services),
                                         base_employment_public),
  # calculate conditional robot growth for graph
         robots_conditional = lm(data = .,
                                 formula = robots ~	base_employment_food_products + base_employment_consumer_goods + base_employment_industrial_goods + base_employment_capital_goods + base_employment_construction + base_employment_maintenance + base_employment_services + base_employment_public + state, na.action=na.exclude) %>% residuals(),
  # create labels for Figure 2 of DFSW
        region_label = ifelse(region_code %in% c(3102, 3103, 3402, 6633, 6433, 7334, 8111, 8125, 8216, 9161, 9279, 9662),
                            region,
                            NA) %>% str_remove_all(", Stadt") %>% str_remove_all(", Landeshauptstadt")) %>%
  filter(year == 1994)

df_long_diff %>% 
  write_rds("../data/sample_replication_DFSW.RDS")

```

# Create ICT country comparison dataset
```{r 1_15}
euklems_ICT = read_rds("../data/EUKLEMS/Statistical_Capital_EUKLEMS_2019.rds") %>% 
  filter(code == "TOT",
         var %in% c("Kq_IT", "Kq_CT", "Kq_Soft_DB"),
         country %in% c("DE", "ES", "FR", "UK", "US")) %>% 
  group_by(country, year) %>% 
  summarise(ICT_stock = sum(value))

euklems_employment = read_rds("../data/EUKLEMS/Statistical_National-Accounts_EUKELMS_2019.rds") %>% 
  filter(code == "TOT",
         var == "EMPE",
         country %in% c("DE", "ES", "FR", "UK", "US")) %>% 
  group_by(country, year) %>% 
  summarise(employment = sum(value))

euklems = euklems_ICT %>% 
  left_join(euklems_employment) %>% 
  mutate(ICT = ICT_stock / employment)
  
euklems %>% 
  write_rds("../data/ICT_country_comparison.rds")
```

# Create CHES data
```{r}
#functions
normalize01 =
  function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}

weighted_mean =
  function(x, w, ..., na.rm = FALSE){
    if(na.rm){
    df_omit <- na.omit(data.frame(x, w))
    return(weighted.mean(df_omit$x, df_omit$w, ...))
  } 
  weighted.mean(x, w, ...)
}

# load and prepare CHES data
ches1 =
  read.csv("../data/CHES/1999-2014_CHES_dataset_means.csv", as.is=TRUE) %>% #load data
  filter(country=="ge") %>% select(year, party, seat, lrecon, galtan) %>% # filter for Germany
  droplevels() %>% 
  mutate(party = party %>% tolower())

ches2 =
  read.csv("../data/CHES/CHES2019V3.csv", as.is=TRUE) %>% 
  filter(country=="ge") %>% 
  select(party, lrecon, galtan) %>% 
  droplevels() %>% 
  mutate(year = 2019,
         seat = NA,
         party = party %>% tolower())

#combine and normalize
ches =
  rbind(ches1, ches2) %>% 
  mutate(galtan01 = galtan %>% as.numeric() %>% normalize01(),
         lrecon01 = lrecon %>% as.numeric() %>% normalize01())

rm(ches1, ches2)

ches$party[ches$party=="grunen"] <- "gruene"
ches$label <- paste(ches$party, ches$year, sep="_")

# party colors
ches$partycolor <- NA
ches$partycolor[ches$party=="csu"] <- "#000000"
ches$partycolor[ches$party=="cdu"] <- "#000000"
ches$partycolor[ches$party=="spd"] <- "#E3000F"
ches$partycolor[ches$party=="fdp"] <- "#FFDE00"
ches$partycolor[ches$party=="gruene"] <- "#1AA037"
ches$partycolor[ches$party=="linke"] <- "#BE3075"
ches$partycolor[ches$party=="pds"] <- "#BE3075"
ches$partycolor[ches$party=="linkspartei/pds"] <- "#BE3075"
ches$partycolor[ches$party=="afd"] <- "#009EE0"
ches$partycolor[ches$party=="rep"] <- "#009EE0"
ches$partycolor[ches$party=="dvu"] <- "#009EE0"
ches$partycolor[ches$party=="npd"] <- "#009EE0"


# only major parties
# weighted mean value by dimension
ches %<>% 
  filter(!party %in% c("dietier", "piraten")) %>%
  mutate(dim_lr_mean = weighted_mean(lrecon01, seat, na.rm=TRUE),
         dim_galtan_mean = weighted_mean(galtan01, seat, na.rm=TRUE)) %>% 
  select(party, year, dim_lr = lrecon01, dim_galtan = galtan01, dim_lr_mean, dim_galtan_mean, partycolor, label)

ches %>% 
  write_rds("../data/sample_CHES.RDS")
```

Clean up
```{r}
rm(list = ls())
print("data preperation finished!")
```


